import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import pandas as pd

import plotly
import plotly.graph_objects as go
import plotly.express as px

from plotly.offline import plot, iplot, init_notebook_mode


# Load the Steam games table; the CSV is looked up in the working directory.
df = pd.read_csv(filepath_or_buffer='steam.csv')
# Preview the first five rows (the default row count of head).
df.head(n=5)
##    appid                       name  ...             owners  price
## 0     10             Counter-Strike  ...  10000000-20000000   7.19
## 1     20      Team Fortress Classic  ...   5000000-10000000   3.99
## 2     30              Day of Defeat  ...   5000000-10000000   3.99
## 3     40         Deathmatch Classic  ...   5000000-10000000   3.99
## 4     50  Half-Life: Opposing Force  ...   5000000-10000000   3.99
## 
## [5 rows x 18 columns]
# Draw one random appid to see what the identifiers look like.
df.appid.sample(n=1)
## 7044    409520
## Name: appid, dtype: int64
# Count the missing values in every column (isna is the same check as isnull).
df.isna().sum()
## appid               0
## name                0
## release_date        0
## english             0
## developer           0
## publisher           0
## platforms           0
## required_age        0
## categories          0
## genres              0
## steamspy_tags       0
## achievements        0
## positive_ratings    0
## negative_ratings    0
## average_playtime    0
## median_playtime     0
## owners              0
## price               0
## dtype: int64
# Take one more look at a random appid before the column is discarded.
df['appid'].sample()
## 3677    315920
## Name: appid, dtype: int64
# appid appears to be a plain identifier, so it is removed from the analysis.
df.drop(columns='appid', inplace=True)
# Inspect the raw language flag (this cell used to be run twice in a row).
df['english'].unique()
## array([1, 0], dtype=int64)
# Replace the 0/1 flag with readable labels used by the charts that follow.
map_dict = {0: 'non-English', 1: 'English'}

df['english'] = df['english'].map(map_dict)
# Count each language label once and reuse the result for labels and values.
english_counts = df['english'].value_counts()

fig = go.Figure(data=[go.Pie(labels=english_counts.index,
                             values=english_counts.values)])

# Show absolute counts on the slices and outline every slice in black.
fig.update_traces(textinfo='value', textfont_size=20,
                  marker=dict(colors=['salmon', 'lightblue'],
                              line=dict(color='#000000', width=2)))

# A pie chart has no cartesian axes, so no axis titles are set
# (the previous 'number of songs' / 'artist' titles did not describe this data).
fig.update_layout(
    height=600, width=600, title_text='English and not English pie chart',
    title_x=0.5,

    font=dict(
        family="Courier New, monospace",
        size=18,
        color="black"),

    # Horizontal legend placed below the plot area, right-aligned.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.2,
        xanchor="right",
        x=1)
)

fig.show()
# Build a tidy two-row frame (label, count) so plotly express can address the
# columns by name instead of receiving the full df next to unrelated arrays.
english_share = (
    df['english']
    .value_counts()
    .rename_axis('english')
    .reset_index(name='count')
)
fig = px.pie(english_share, names='english', values='count',
             title='Jogos com idioma ingles')
# Print both the percentage and the label on each slice.
# fig.show() is not called here; presumably the notebook renders the figure
# returned by this last expression.
fig.update_traces(textinfo='percent+label')

Temos que 98,11% dos jogos possuem o idioma inglês e apenas 1,89% não possuem esse idioma.

Vamos verificar a quantidade de proprietários.



# Five least frequent ownership brackets (value_counts orders by frequency, descending).
df['owners'].value_counts().tail(n=5)
## 5000000-10000000       46
## 10000000-20000000      21
## 20000000-50000000       3
## 50000000-100000000      2
## 100000000-200000000     1
## Name: owners, dtype: int64

Apenas 1 dos jogos possui mais de cem milhões de proprietários (jogadores).

# Select the games that fall into one of these three ownership brackets.
top_brackets = ['20000000-50000000', '50000000-100000000', '100000000-200000000']
in_top_bracket = df['owners'].isin(top_brackets)
# NOTE: 'owners' holds strings, so this ordering is lexicographic
# ('100000000-...' sorts before '20000000-...'), not numeric.
top_6_owners = df[in_top_bracket].sort_values(by='owners')

print(top_6_owners)
          
##                                    name  ...  price
## 22                               Dota 2  ...   0.00
## 19                      Team Fortress 2  ...   0.00
## 1634                           Warframe  ...   0.00
## 3362                           Unturned  ...   0.00
## 25     Counter-Strike: Global Offensive  ...   0.00
## 12836     PLAYERUNKNOWN'S BATTLEGROUNDS  ...  26.99
## 
## [6 rows x 17 columns]

Perceba que Dota 2 é o jogo mais adquirido ou jogado na Steam; um dos motivos pode ser seu preço (grátis).

# Convert the release dates into datetime64 values.
df['release_date'] = pd.to_datetime(df.release_date)
# Check the first five converted values.
df['release_date'].head(n=5)
## 0   2000-11-01
## 1   1999-04-01
## 2   2003-05-01
## 3   2001-06-01
## 4   1999-11-01
## Name: release_date, dtype: datetime64[ns]

Vamos verificar a quantidade de jogos grátis.


# Count free (price == 0) and paid games from one boolean mask instead of
# materialising two filtered copies of the frame.
is_free = df['price'] == 0
free, not_free = int(is_free.sum()), int((~is_free).sum())

labels = ['free', 'not free']

# Give plotly express a small tidy frame whose columns match the arguments,
# rather than the full df together with unrelated two-element lists.
pricing_counts = pd.DataFrame({'pricing': labels, 'games': [free, not_free]})
fig = px.pie(pricing_counts, names='pricing', values='games',
             title='relação de jogos gratuitos')
# Print both the percentage and the label on each slice.
fig.update_traces(textinfo='percent+label')

          

Apenas 9,46% dos jogos disponíveis são gratuitos.

Vamos ver quantos jogos na Steam oferecem jogabilidade multiplayer e quantos não oferecem esse recurso.

import plotly.express as px

# Flag every game whose category string mentions 'multi-player' (case-insensitive).
df['multiplayer'] = ['multi-player' in cats.lower() for cats in df['categories']]
# Display the frame without 'categories'; the result is not assigned,
# so df itself keeps the column.
df.drop(columns=['categories'])
##                              name release_date  ... price multiplayer
## 0                  Counter-Strike   2000-11-01  ...  7.19        True
## 1           Team Fortress Classic   1999-04-01  ...  3.99        True
## 2                   Day of Defeat   2003-05-01  ...  3.99        True
## 3              Deathmatch Classic   2001-06-01  ...  3.99        True
## 4       Half-Life: Opposing Force   1999-11-01  ...  3.99        True
## ...                           ...          ...  ...   ...         ...
## 27070             Room of Pandora   2019-04-24  ...  2.09       False
## 27071                   Cyber Gun   2019-04-23  ...  1.69       False
## 27072            Super Star Blast   2019-04-24  ...  3.99        True
## 27073  New Yankee 7: Deer Hunters   2019-04-17  ...  5.19       False
## 27074                   Rune Lord   2019-04-24  ...  5.19       False
## 
## [27075 rows x 17 columns]
# Derive every bar label from the flag it counts, so labels and counts cannot
# be swapped if the majority class differs from what a fixed label order assumes.
multiplayer_counts = df['multiplayer'].value_counts()
label_by_flag = {False: 'No-multiplayer', True: 'Multiplayer'}
valor = multiplayer_counts.values

bar_data = pd.DataFrame({
    'mode': [label_by_flag[bool(flag)] for flag in multiplayer_counts.index],
    'games': valor,
})
# The title now describes the chart (it was the plotly docs' "Wide-Form Input").
fig = px.bar(bar_data, x='mode', y='games',
             title='Multiplayer vs. non-multiplayer games')
fig.show()

Fazendo a mesma análise utilizando a linguagem R.

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Carregando o banco de dados na linguagem R.

# Read the same CSV into an R data frame.
dados <- read.csv(file = "steam.csv")

Verificando se temos valores ausentes.

# TRUE when at least one cell of the data frame is NA.
anyNA(dados)
## [1] FALSE

Como o valor retornado foi FALSE, não temos dados faltantes.

# Compact overview: row/column counts, then one line per column with its type.
dados %>% glimpse()
## Rows: 27,075
## Columns: 18
## $ appid            <int> 10, 20, 30, 40, 50, 60, 70, 80, 130, 220, 240, 280, 3~
## $ name             <chr> "Counter-Strike", "Team Fortress Classic", "Day of De~
## $ release_date     <chr> "2000-11-01", "1999-04-01", "2003-05-01", "2001-06-01~
## $ english          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ developer        <chr> "Valve", "Valve", "Valve", "Valve", "Gearbox Software~
## $ publisher        <chr> "Valve", "Valve", "Valve", "Valve", "Valve", "Valve",~
## $ platforms        <chr> "windows;mac;linux", "windows;mac;linux", "windows;ma~
## $ required_age     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ categories       <chr> "Multi-player;Online Multi-Player;Local Multi-Player;~
## $ genres           <chr> "Action", "Action", "Action", "Action", "Action", "Ac~
## $ steamspy_tags    <chr> "Action;FPS;Multiplayer", "Action;FPS;Multiplayer", "~
## $ achievements     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 147, 0, 54, 0, 0, 0, 1~
## $ positive_ratings <int> 124534, 3318, 3416, 1273, 5250, 2758, 27755, 12120, 3~
## $ negative_ratings <int> 3339, 633, 398, 267, 288, 684, 1100, 1439, 420, 2419,~
## $ average_playtime <int> 17612, 277, 187, 258, 624, 175, 1300, 427, 361, 691, ~
## $ median_playtime  <int> 317, 62, 34, 184, 415, 10, 83, 43, 205, 402, 400, 214~
## $ owners           <chr> "10000000-20000000", "5000000-10000000", "5000000-100~
## $ price            <dbl> 7.19, 3.99, 3.99, 3.99, 3.99, 3.99, 7.19, 7.19, 3.99,~
# install.packages("psych")  # run once if the package is not installed yet
library("psych")
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Per-column descriptive statistics (n, mean, sd, median, ..., skew, kurtosis, se).
psych::describe(dados)
##                  vars     n      mean        sd    median   trimmed       mad
## appid               1 27075 596203.51 250894.17 599070.00 602523.30 294770.53
## name*               2 27075  13515.04   7804.05  13515.00  13514.70  10020.89
## release_date*       3 27075   1859.53    564.28   1995.00   1923.12    530.77
## english             4 27075      0.98      0.14      1.00      1.00      0.00
## developer*          5 27075   8524.68   4943.50   8462.00   8510.99   6354.42
## publisher*          6 27075   7036.98   4196.98   6920.00   7013.10   5454.49
## platforms*          7 27075      4.79      1.20      4.00      4.61      0.00
## required_age        8 27075      0.35      2.41      0.00      0.00      0.00
## categories*         9 27075   2021.24   1106.18   2762.00   2066.67    676.07
## genres*            10 27075    596.50    372.53    551.00    577.96    391.41
## steamspy_tags*     11 27075   2535.22   1824.15   2213.00   2418.19   2001.51
## achievements       12 27075     45.25    352.67      7.00     10.89     10.38
## positive_ratings   13 27075   1000.56  18988.72     24.00     79.30     32.62
## negative_ratings   14 27075    211.03   4284.94      9.00     24.56     11.86
## average_playtime   15 27075    149.80   1827.04      0.00     17.02      0.00
## median_playtime    16 27075    146.06   2353.88      0.00     17.51      0.00
## owners*            17 27075      2.75      3.02      1.00      2.08      0.00
## price              18 27075      6.08      7.87      3.99      4.85      4.45
##                  min        max      range   skew kurtosis      se
## appid             10 1069460.00 1069450.00  -0.22    -0.65 1524.78
## name*              1   27033.00   27032.00   0.00    -1.20   47.43
## release_date*      1    2619.00    2618.00  -0.99     0.61    3.43
## english            0       1.00       1.00  -7.07    48.00    0.00
## developer*         1   17113.00   17112.00   0.02    -1.21   30.04
## publisher*         1   14354.00   14353.00   0.03    -1.24   25.51
## platforms*         1       7.00       6.00   1.00    -0.78    0.01
## required_age       0      18.00      18.00   6.75    44.09    0.01
## categories*        1    3333.00    3332.00  -0.36    -1.67    6.72
## genres*            1    1552.00    1551.00   0.37    -0.61    2.26
## steamspy_tags*     1    6423.00    6422.00   0.40    -0.97   11.09
## achievements       0    9821.00    9821.00  13.43   191.14    2.14
## positive_ratings   0 2644404.00 2644404.00 106.10 14083.85  115.40
## negative_ratings   0  487076.00  487076.00  88.63  9122.52   26.04
## average_playtime   0  190625.00  190625.00  59.52  5081.15   11.10
## median_playtime    0  190625.00  190625.00  64.36  4893.18   14.31
## owners*            1      13.00      12.00   1.49     0.79    0.02
## price              0     421.99     421.99  11.73   420.71    0.05
# gt renders data frames as formatted tables.
library(gt)

# Rendering the whole data set as a table is left disabled.
# dados %>% gt()

Aparentemente, a coluna appid é apenas um identificador (id); não vamos utilizá-la.

# Drop the identifier by name rather than by position: unlike dados[-1], this
# cannot remove a different column if the chunk is run a second time.
dados$appid <- NULL
# Frequency of each value of the language flag.
table(dados$english)
## 
##     0     1 
##   511 26564

Assumindo 0 como não inglês e 1 como inglês, temos:

library(plotly)

# Readable labels for the 0/1 language flag, listed in the same order as the
# factor levels below (and matching the labels used in the Python section).
categoria <- c("non-English", "English")

# Labelled factor, so the chart shows names instead of the raw 0/1 codes.
dados$inglesf <- factor(dados$english, levels = c(0, 1), labels = categoria)

# Count the games per language label and draw the shares as a pie chart.
p <- dados %>%
  group_by(inglesf) %>%
  summarize(count = n()) %>%
  plot_ly(labels = ~inglesf, values = ~count) %>%
  add_pie(hole = 0) %>%
  layout(title = "Jogos com idioma ingles",  showlegend = F,
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = TRUE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = TRUE))
p